(I) Background
- Instructor: Peng Wang, AVP, Head of Data Science - Operation & Fraud Detection at MassMutual Financial Group (Fall 2016)
- The data set used in this project is gapminder from Bioconnector. It records life expectancy, population, and GDP per capita for 142 countries across five continents, from 1952 to 2007 at 5-year intervals.
- Download Data
- Multiple visualizations of the data set (Section II) and an interactive web application (Section III) were made in this project report.
(II) Data Analysis
# Location of the Bioconnector copy of the gapminder data set.
gapminder_url <- "https://bioconnector.github.io/workshops/data/gapminder.csv"
# Read the CSV at that URL; every later chunk works from this object.
gapminder <- read_csv(file = gapminder_url)
# Render a data frame as a centre-aligned, styled kable table.
#
# df: the data frame to display.
# Returns the styled table object.
render_df <- function(df) {
  n_rows <- nrow(df)
  styling <- c("striped", "hover", "responsive", "condensed")
  styled <- kable_styling(
    kable(df, align = "c"),
    bootstrap_options = styling,
    fixed_thead = TRUE,
    full_width = FALSE
  )
  # Apply the CSS to rows 0 through n_rows (row 0 is presumably the header
  # row in kableExtra's numbering -- confirm against its documentation).
  row_spec(styled, 0:n_rows, extra_css = "vertical-align: middle;")
}
1. Number of countries per continent
- How many unique countries are represented per continent?
# Count the distinct countries recorded for each continent, using
# display-ready column names.
df_1 <- gapminder %>%
  group_by(continent) %>%
  summarise(Country = n_distinct(country)) %>%
  rename(Continent = continent)
render_df(df_1)
|
Continent
|
Country
|
|
Africa
|
52
|
|
Americas
|
25
|
|
Asia
|
33
|
|
Europe
|
30
|
|
Oceania
|
2
|
# Bar chart of the per-continent country counts held in df_1.
ggplot(df_1, aes(x = Continent, y = Country)) +
  geom_col(width = 0.5) +
  labs(title = "Country Number of Each Continent") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))

2. According to the data available, what was the average Life Expectancy across each continent from 1952 to 2007?
# Mean life expectancy for every continent / survey-year pair, in long format
# (one row per pair). aggregate() takes the years straight from the data, so
# the Year column cannot drift out of step with the values the way a
# hand-written seq(1952, 2007, 5) bound by position could.
# Note: the formula interface drops rows with missing values before averaging.
df_2 <- aggregate(lifeExp ~ continent + year, data = gapminder, FUN = mean)
names(df_2) <- c("continent", "Year", "Average.lifeExp")
df_2 <- df_2[c("Year", "continent", "Average.lifeExp")]

# One coloured line (with point markers) per continent.
p1 <- ggplot(df_2, aes(x = Year, y = Average.lifeExp, color = continent)) +
  geom_point() +
  geom_line() +
  ggtitle("Average Life Expectancy") +
  xlab("Year") +
  ylab("Life Expectancy (Years)") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))
p1

2.1 Life Expectancy for Every Country in the Americas
# Life expectancy of every country in the Americas as a year-by-country matrix.
df_3 <- gapminder %>% filter(continent == "Americas")
# tapply() gives country x year; transposing puts years in rows so matplot()
# draws one line per country. Skipping the data.frame() round trip keeps the
# country names intact (data.frame() would rewrite "United States" as
# "United.States").
df_3 <- t(tapply(df_3$lifeExp, list(df_3$country, df_3$year), mean))
# The x axis is read from the matrix row names, so it always matches the
# years actually present in the data.
matplot(as.numeric(rownames(df_3)), df_3,
        type = "l", lty = 1, xlab = "Years", ylab = "Life Expectancy")

2.2 Countries that have the longest average Life Expectancy in the world
# Mean life expectancy per country over all survey years; keep the five
# highest and show them as a table.
country_means_desc <- sort(
  tapply(gapminder$lifeExp, gapminder$country, mean),
  decreasing = TRUE
)
df_4 <- as.data.frame(country_means_desc[1:5])
colnames(df_4) <- "Average Life Expectancy"
kable(df_4, align = "c")
|
|
Average Life Expectancy
|
|
Iceland
|
76.51142
|
|
Sweden
|
76.17700
|
|
Norway
|
75.84300
|
|
Netherlands
|
75.64850
|
|
Switzerland
|
75.56508
|
2.3 Countries that have the shortest average Life Expectancy in the world
# Mean life expectancy per country over all survey years; keep the five
# lowest and show them as a table.
country_means_asc <- sort(
  tapply(gapminder$lifeExp, gapminder$country, mean),
  decreasing = FALSE
)
df_5 <- as.data.frame(country_means_asc[1:5])
colnames(df_5) <- "Average Life Expectancy"
kable(df_5, align = "c")
|
|
Average Life Expectancy
|
|
Sierra Leone
|
36.76917
|
|
Afghanistan
|
37.47883
|
|
Angola
|
37.88350
|
|
Guinea-Bissau
|
39.21025
|
|
Mozambique
|
40.37950
|
3. According to the data available, what was the average Population across each continent from 1952 to 2007?
# Mean population for every continent / survey-year pair, in long format
# (one row per pair). aggregate() takes the years straight from the data, so
# the Year column cannot drift out of step with the values the way a
# hand-written seq(1952, 2007, 5) bound by position could.
# Note: the formula interface drops rows with missing values before averaging.
df_6 <- aggregate(pop ~ continent + year, data = gapminder, FUN = mean)
names(df_6) <- c("continent", "Year", "pop")
df_6 <- df_6[c("Year", "continent", "pop")]

# One coloured line (with point markers) per continent.
p2 <- ggplot(df_6, aes(x = Year, y = pop, color = continent)) +
  geom_point() +
  geom_line() +
  ggtitle("Average Population") +
  xlab("Year") +
  ylab("Population") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))
p2

3.1 Population for Every Country in the Americas
# Population of every country in the Americas as a year-by-country matrix.
df_7 <- gapminder %>% filter(continent == "Americas")
# tapply() gives country x year; transposing puts years in rows so matplot()
# draws one line per country. Skipping the data.frame() round trip keeps the
# country names intact (data.frame() would rewrite "United States" as
# "United.States").
df_7 <- t(tapply(df_7$pop, list(df_7$country, df_7$year), mean))
# The x axis is read from the matrix row names, so it always matches the
# years actually present in the data.
matplot(as.numeric(rownames(df_7)), df_7,
        type = "l", lty = 1, xlab = "Years", ylab = "Population")

4. According to the data available, what was the average GDP per Capita across each continent from 1952 to 2007?
# Mean GDP per capita for every continent / survey-year pair, in long format
# (one row per pair). aggregate() takes the years straight from the data, so
# the Year column cannot drift out of step with the values the way a
# hand-written seq(1952, 2007, 5) bound by position could.
# Note: the formula interface drops rows with missing values before averaging.
df_8 <- aggregate(gdpPercap ~ continent + year, data = gapminder, FUN = mean)
names(df_8) <- c("continent", "Year", "gdpPercap")
df_8 <- df_8[c("Year", "continent", "gdpPercap")]

# One coloured line (with point markers) per continent.
p3 <- ggplot(df_8, aes(x = Year, y = gdpPercap, color = continent)) +
  geom_point() +
  geom_line() +
  ggtitle("Average GDP per Capita") +
  xlab("Year") +
  ylab("GDP per Capita") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))
p3

4.1 GDP Per Capita for Every Country in the Americas
# GDP per capita of every country in the Americas as a year-by-country matrix.
df_9 <- gapminder %>% filter(continent == "Americas")
# tapply() gives country x year; transposing puts years in rows so matplot()
# draws one line per country. Skipping the data.frame() round trip keeps the
# country names intact, which the legend lookup below relies on.
df_9 <- t(tapply(df_9$gdpPercap, list(df_9$country, df_9$year), mean))

# matplot() cycles colours 1:6 over the columns by default. Building that
# vector explicitly lets the legend reuse the exact colour of each line.
line_cols <- rep_len(1:6, ncol(df_9))
matplot(as.numeric(rownames(df_9)), df_9,
        type = "l", lty = 1, col = line_cols,
        xlab = "Years", ylab = "GDP Per Capita")

# Look up the columns of the two highlighted countries so each legend entry
# shows its own line colour (previously both entries were drawn with col = 3,
# which matched neither line).
highlighted <- match(c("United States", "Canada"), colnames(df_9))
legend("topleft",
       legend = c("Top 1: America", "Top 2: Canada"),
       lty = 1,
       col = line_cols[highlighted])

(III) Interactive Web Application
library(shiny)
# Page layout: a sidebar holding the country and indicator selectors, and a
# main panel showing the plot output followed by the table output.
UI <- fluidPage(
  titlePanel("World Facts"),
  sidebarLayout(
    sidebarPanel(
      selectInput(
        inputId = "select",
        label = "Choose a country",
        choices = unique(gapminder$country)
      ),
      selectInput(
        inputId = "object",
        label = "Choose from the following",
        choices = c("Life Expectancy", "Population", "GDP per Capita")
      )
    ),
    mainPanel(
      plotOutput(outputId = "figure"),
      tableOutput(outputId = "data")
    )
  )
)
# Shiny server logic: plots the chosen indicator for the chosen country over
# time and lists the same values in a table.
#
# input:  Shiny inputs; reads input$select (country name) and input$object
#         (indicator label).
# output: Shiny outputs; fills output$figure and output$data.
SERVER <- function(input, output) {
  # Indicator label offered in the UI -> gapminder column holding the values
  # and the header used for that column in the table.
  metrics <- list(
    "Life Expectancy" = list(column = "lifeExp", header = "Life Expectancy"),
    "Population" = list(column = "pop", header = "pop"),
    "GDP per Capita" = list(column = "gdpPercap", header = "GDP per Capita")
  )

  # Rows of gapminder that belong to the country named `temp`.
  f <- function(temp) {
    subset(gapminder, gapminder$country == temp)
  }

  output$figure <- renderPlot({
    metric <- metrics[[input$object]]
    if (is.null(metric)) {
      # Unknown indicator label: draw nothing.
      return(NULL)
    }
    # Inputs are read directly; the render function is already reactive, so
    # wrapping them in a fresh reactive() on every redraw is unnecessary.
    dat <- f(input$select)
    values <- dat[[metric$column]]
    # paste0() yields a single title string; c() would pass a two-element
    # vector and split the title into separate pieces.
    plot(dat$year, values,
         xlim = c(1950, 2010), xlab = "Year", ylab = input$object,
         lty = 2, type = "l",
         main = paste0(input$object, " of ", input$select))
    points(dat$year, values, pch = 19, col = 1)
  })

  output$data <- renderTable(
    {
      metric <- metrics[[input$object]]
      if (is.null(metric)) {
        return(NULL)
      }
      dat <- f(input$select)
      # Build a real data frame instead of a bare list. A list is presumably
      # coerced with syntactic-name checking before rendering, which would
      # turn "Life Expectancy" into "Life.Expectancy" -- confirm against
      # renderTable(). Years are stored as integers so they are not printed
      # with decimal places (assumes non-integer numerics get decimals by
      # default -- confirm).
      table_data <- data.frame(year = as.integer(dat$year), check.names = FALSE)
      table_data[[metric$header]] <- dat[[metric$column]]
      table_data
    },
    colnames = TRUE
  )
}
# Combine the UI definition and the server function into the Shiny app.
shinyApp(ui = UI, server = SERVER)